***** Is Europe an optimal Political Area?                *****
***** Alberto Alesina, Guido Tabellini & Francesco Trebbi *****

***** Section I.C

* Session setup: empty the session first, then configure Stata.
clear all
* Do not pause the output with -more- prompts.
set more off
* Memory request and maximum matrix size.
set memory 10g
set matsize 800

********************************************************************************
*Prep. Population data
* Step 1: trim the population file to years up to 2009 and overwrite it on disk.
use "data/Population.dta", clear
drop if year > 2009
save "data/Population.dta", replace

* Step 2: log population in 1980, one record per NUTS_id kept in memory.
keep if year == 1980
generate ln_pop = ln(pop)
keep NUTS_id ln_pop

* The same two columns are written out twice under different names so they can
* later be merged once on ROW and once on COL of a region pair.
* COL copy (inside preserve so the unrenamed data come back afterwards).
preserve
rename NUTS_id COL
rename ln_pop col_ln_pop
sort COL
save "pop_col.dta", replace
restore

* ROW copy.
rename NUTS_id ROW
rename ln_pop row_ln_pop
sort ROW
save "pop_row.dta", replace

********************************************************************************
*Prep. Distance data
* "Column" copy of the coordinates: id becomes COL, coordinates become lat2 and lon2,
* and a constant is added to serve as the join key.
use "data/NUTS_3_coordinates.dta", clear
rename NUTS_id COL
rename lat lat2
rename lon lon2
generate const = 1
sort const
save "tmp0.dta", replace

* "Row" copy: id becomes ROW. Joining on the constant pairs every ROW record
* with every COL record.
use "data/NUTS_3_coordinates.dta", clear
generate const = 1
rename NUTS_id ROW
joinby const using "tmp0.dta"

* Distance between (lat, lon) and (lat2, lon2) from the user-written geodist command,
* and its natural log.
geodist lat lon lat2 lon2, gen(distance)
generate ln_distance = ln(distance)

* Store sorted by the pair key used in the later merges, then remove the scratch file.
sort ROW COL
save "pairwise_NUTS_id_distance.dta", replace
erase "tmp0.dta"

********************************************************************************
**Programs for unsupervised learning:
**Dimensionality Reduction (PCA, MDS); Partitioning Cluster Analysis (SC)
********************************************************************************
*Program for eigengap plot in spectral clustering (SC)
program spectral_clustering_plot
	* Eigenvalue plot for the spectral clustering (SC) approach.
	* Input : matrix A, which must exist before the call (the callers in this file
	*         set it to the correlation matrix r(C)).
	* Output: a line graph of the 19 smallest eigenvalues of the graph Laplacian built from A.
	*         The data in memory are preserved on entry and restored on exit.
	* Leaves behind scalar nn, global macro nn and matrices W, gamma_h, b, D, L, X and v.
	* Uses the user-written commands matsum and svmat2.
	preserve
	clear
	* Dimension of A, kept both as a scalar and as a global macro
	scalar nn = colsof(A)
	global nn = nn
	set obs $nn
	* Similarity matrix W: a copy of A in which every entry below .35 is set to 0
	* (a correlation of at least 35% is required). No other rescaling is applied here.
	matrix W = A
	forvalues r = 1/$nn {
		forvalues c = 1/$nn {
			if A[`r',`c'] < .35 {
				matrix W[`r',`c'] = 0
			}
		}
	}
	* Subtract the identity; this zeroes the diagonal when A has a unit diagonal,
	* as a correlation matrix does
	matrix gamma_h = W - I(nn)
	* b is returned by the r() option of matsum
	* NOTE(review): presumably the row sums of gamma_h - confirm against the matsum help
	matsum(gamma_h), r(b)
	matrix D = diag(b)
	* Graph Laplacian L
	matrix L = D - gamma_h
	* Eigen-decomposition of L: eigenvectors in X, eigenvalues in v
	matrix symeigen X v = L
	matrix v = v'
	* Eigenvalues become a variable, ordered from smallest to largest
	svmat2 double v
	sort v
	* Rank of each eigenvalue and gap to the previous one
	generate Number = _n
	generate Eigengap = v - v[_n-1]
	rename v Eigenvalue
	* For readability only the first 19 are kept
	keep if _n < 20
	* Read the plot by looking for the k smallest eigenvalues (those close to zero,
	* i.e. the dimension of the null space) before the jump up
	twoway (line Eigenvalue Number, lwidth(thick) lpattern(solid)), title("Eigenvalue" "SC approach")
	restore
end
********************************************************************************
*Program for eigengap plot in principal component analysis (PCA)
program pca_plot
	* Eigenvalue plot for the principal component (PCA) approach.
	* Input : the variables whose names start with DY in the data in memory.
	* Output: a line graph of the first 19 eigenvalues returned by pca in e(Ev).
	*         The data in memory are preserved on entry and restored on exit.
	* Leaves behind matrix v. Uses the user-written command svmat2.
	preserve
	keep DY*
	quietly pca DY*
	* Eigenvalues as a column vector
	matrix v = e(Ev)'
	clear
	svmat2 double v
	* Rank of each eigenvalue and drop relative to the previous one.
	* No sorting is done here: the order is the one in which pca stores e(Ev).
	generate Number = _n
	generate Eigengap = v[_n-1] - v
	rename v Eigenvalue
	* For readability only the first 19 are kept
	keep if _n < 20
	* Read the plot by looking for the k largest eigenvalues before the drop (elbow rule)
	twoway (line Eigenvalue Number, lwidth(thick) lpattern(solid)), title("Eigenvalue" "PCA approach")
	restore
end
********************************************************************************
*Program for plot in multidimensional scaling (MDS)
program mds_plot
	* Mardia-statistics plot for the multidimensional scaling (MDS) approach.
	* Input : variable id and the variables whose names start with DY in the data in memory.
	* Output: a spike plot of e(mardia1) and e(mardia2) against the number of dimensions.
	*         The data in memory are preserved on entry and restored on exit.
	* Leaves behind matrices v1, v2 and scalars y1, y2. Uses the user-written command svmat2.
	preserve
	keep id DY*
	* Both accumulators start with one missing entry, which is removed again below
	matrix v1 = J(1,1,.)
	matrix v2 = J(1,1,.)
	* Note: classical MDS with euclidean distance [option: measure(L2) std] is equivalent
	* to PCA, with MDS coordinates comparable to the principal components.
	* The Manhattan distance, measure(L1), is used here instead.
	forvalues dim = 1/10 {
		quietly mds DY*, id(id) measure(L1) dimension(`dim') noplot
		* Append the two Mardia measures for this number of dimensions
		matrix v1 = (v1\e(mardia1))
		matrix v2 = (v2\e(mardia2))
		* Stop once the requested dimension has reached e(np)
		scalar y1 = e(np)
		scalar y2 = `dim'
		if y2 >= y1 {
			di "Attention break in loop as dim() will exceed number of positive eigenvalues of the centered dissimilarity matrix"
			continue, break
		}
	}
	clear
	svmat2 double v1
	svmat2 double v2
	* Remove the missing seed row, then build the plotting variables
	drop if _n==1
	generate Mardia1 = v1
	generate Mardia2 = v2
	label variable Mardia1 "Mardia 1"
	label variable Mardia2 "Mardia 2"
	generate Number = _n
	* Second x-axis variable shifted by 0.1 so the two sets of spikes do not overlap
	generate Number2 = Number + 0.1
	* For readability at most 9 rows are kept
	keep if _n < 10
	* Ratios of the sum of eigenvalues (or squared eigenvalues) in the p dimensions
	* relative to the same sum over all n dimensions
	twoway (spike Mardia1 Number, lwidth(thick) lpattern(solid)) (spike Mardia2 Number2, lwidth(thick) lpattern(dash)), title("Mardia statistics" "MDS approach")
	restore
end
********************************************************************************

*****Generate correlation plots & clustering plots
use "data/GDP", clear
generate Y = gdp

* The country code is the first two characters of the NUTS id;
* the three- and four-character prefixes are kept alongside it.
generate country   = substr(NUTS_id, 1, 2)
generate NUTS_1_id = substr(NUTS_id, 1, 3)
generate NUTS_2_id = substr(NUTS_id, 1, 4)
* Making sure higher aggregations are dropped: a record whose id equals one of its own
* prefixes has an id of at most four characters.
drop if NUTS_id == country | NUTS_id == NUTS_1_id | NUTS_id == NUTS_2_id
* Removing Eastern Europe (countries with data from 1991 onwards).
* Two inlist() calls because the list is longer than inlist() accepts for strings.
drop if inlist(country, "BG", "CZ", "EE", "CY", "LV", "LT")
drop if inlist(country, "HU", "MT", "RO", "SI", "SK", "PL")
* Drop regions that do not have the full sample: at least 30 years with non-missing Y
drop if Y == .
bysort NUTS_id (year): drop if _N < 30
drop country NUTS_1_id NUTS_2_id
* Growth rate in percent, relative to the previous record of the same region
bysort NUTS_id (year): generate DY = 100*(Y/Y[_n-1] - 1)

* Three layouts of the same growth rates are saved.
* (1) long panel
keep DY year NUTS_id
save "tmp0.dta", replace
* (2) year x region: one row per year, one DY column per region
preserve
reshape wide DY, i(year) j(NUTS_id) string
save "tmp1.dta", replace
restore
* (3) region x year: one row per region, one DY column per year
reshape wide DY, i(NUTS_id) j(year)
save "tmp1_1.dta", replace
****************************************************************************

**Pre 1999 sample
* Year x region growth rates for the years before 1999.
use "tmp1.dta", clear
drop if year >= 1999
keep year DY*
* id is a copy of year. Only mds_plot reads it, and mds_plot is not called here.
generate id = year

**PCA
pca_plot
graph save "section1/fig3_pca_pre_GDP", replace

* Reload the same sub-sample and keep only the growth-rate columns.
use "tmp1.dta", clear
drop if year >= 1999
drop year
* Correlation matrix of growth rates across all region pairs
quietly correlate DY*
matrix A = r(C)
**SC
spectral_clustering_plot
graph save "section1/fig3_sc_pre_GDP", replace

****************************************************************************
*Now making room for the matrix of correlations
* Turns matrix A (pre-1999 correlation matrix) into a long dataset of region pairs and
* writes it out as tmp2.dta (lDY by ROW COL) and tmp2_1.dta (same values with ROW and COL
* swapped, named lDY_temp).
clear
* One variable per column of A, named after the column, plus the row names in ROW
svmat2 double A, name(col) r(ROW)
* Strip the DY prefix so that ROW holds the bare region id
replace ROW = regexr(ROW, "DY","")
*reshape region x region correlation matrix into vector (making sure we keep n*(n-1)/2)
reshape long DY, i(ROW) j(COL) string
save "GDP_pre.dta", replace
*Keeping unique correlations
* Entries equal to 1 are removed (the diagonal of the correlation matrix).
drop if DY==1
* Within each group of identical correlation values the second record is dropped.
* NOTE(review): this removes the mirror image of each pair only if the two symmetric
* entries are exactly equal and no other pair shares the same value - confirm.
bysort DY (ROW): drop if _n==2
rename DY DY_correlation
* The original label used a local macro named file that is never defined, which left the
* label truncated; it now reads the same as the post-1999 label.
label var DY_correlation "Regional correlations in GDP"
*Identifying countries
gen country_row = substr(ROW,1,2)
gen country_col = substr(COL,1,2)
gen same = (country_row==country_col)
*Merge with population and pairwise distance measure
* Each merge keeps matched records only, so these steps restrict the sample to pairs
* present in the population and distance files.
sort ROW
merge ROW using "pop_row.dta"
tab _merge
keep if _merge==3
drop _merge
sort COL
merge COL using "pop_col.dta"
tab _merge
keep if _merge==3
drop _merge
gen pop_weight = row_ln_pop+col_ln_pop
sort ROW COL
merge ROW COL using "pairwise_NUTS_id_distance.dta"
tab _merge
keep if _merge==3
drop _merge
* Only the pair key and the pre-1999 correlation are carried forward
keep ROW COL DY_correlation
rename DY_correlation lDY
sort ROW COL
save "tmp2.dta", replace
*since ROW & COL can swap I need this trick to make the merge below possible
rename COL C
rename ROW COL
rename C ROW
rename lDY lDY_temp
sort ROW COL
save "tmp2_1.dta", replace
****************************************************************************
**Post 1999 sample
* Year x region growth rates for 1999 and later.
use "tmp1.dta", clear
drop if year < 1999
keep year DY*
* id is a copy of year. Only mds_plot reads it, and mds_plot is not called here.
generate id = year
**PCA
pca_plot
graph save "section1/fig3_pca_post_GDP", replace

* Reload the same sub-sample and keep only the growth-rate columns.
use "tmp1.dta", clear
drop if year < 1999
drop year
* Correlation matrix of growth rates across all region pairs
quietly correlate DY*
matrix A = r(C)
**SC
spectral_clustering_plot
graph save "section1/fig3_sc_post_GDP", replace
****************************************************************************

*Now making room for the matrix of correlations
* Turns matrix A (post-1999 correlation matrix) into a long dataset of region pairs,
* adds country-group indicators and restricts to pairs found in the population and
* distance files.
clear
* One variable per column of A, named after the column, plus the row names in ROW
svmat2 double A, name(col) r(ROW)
replace ROW = regexr(ROW, "DY","")
reshape long DY, i(ROW) j(COL) string
save "GDP_post.dta", replace
*Keeping unique correlations
* Entries equal to 1 are removed, then the second record within each group of
* identical correlation values.
drop if DY == 1
bysort DY (ROW): drop if _n == 2
rename DY DY_correlation
label variable DY_correlation "Regional correlations in GDP"
*Identifying countries
generate country_row = substr(ROW, 1, 2)
generate country_col = substr(COL, 1, 2)
generate same = (country_row == country_col)
*Periphery & core
foreach side in row col {
	generate periphery_`side' = inlist(country_`side', "GR", "IT", "ES", "PT", "IE")
	generate core_`side' = inlist(country_`side', "DE", "AT", "FR", "NL", "BE", "LU", "FI")
}
* sample2: both regions belong to a periphery or core country
generate sample2 = ((periphery_row == 1 | core_row == 1) & (periphery_col == 1 | core_col == 1))
generate corecore = (core_row == 1 & core_col == 1)
generate perper = (periphery_row == 1 & periphery_col == 1)
* Mixed pairs are what is left of sample2
generate coreper = (sample2 == 1 & corecore == 0 & perper == 0)
*EMU & Not-EMU
* The EMU list is split over two inlist() calls because it is longer than inlist()
* accepts for strings.
foreach side in row col {
	generate emu_`side' = (inlist(country_`side', "DE", "AT", "FR", "NL", "BE", "LU", "FI") | inlist(country_`side', "GR", "IT", "ES", "PT", "IE"))
	generate nemu_`side' = inlist(country_`side', "DK", "SE", "UK", "NO")
}
* sample3: both regions belong to an EMU or a listed non-EMU country
generate sample3 = ((emu_row == 1 | nemu_row == 1) & (emu_col == 1 | nemu_col == 1))
generate emuemu = (emu_row == 1 & emu_col == 1)
generate nemunemu = (nemu_row == 1 & nemu_col == 1)
* Mixed pairs are what is left of sample3
generate nemuemu = (sample3 == 1 & emuemu == 0 & nemunemu == 0)
*
*Merge with population and pairwise distance measure
* Each merge keeps matched records only.
sort ROW
merge ROW using "pop_row.dta"
tabulate _merge
keep if _merge == 3
drop _merge
sort COL
merge COL using "pop_col.dta"
tabulate _merge
keep if _merge == 3
drop _merge
generate pop_weight = row_ln_pop+col_ln_pop
sort ROW COL
merge ROW COL using "pairwise_NUTS_id_distance.dta"
tabulate _merge
keep if _merge == 3
drop _merge
****************************************************************************
*Link pre and post region correlations
* First attempt: pre-1999 value stored under the same ROW COL orientation (lDY).
sort ROW COL
merge ROW COL using "tmp2.dta"
display "Check this"
tabulate _merge
drop _merge
* Second attempt: pre-1999 value stored under the swapped orientation (lDY_temp).
sort ROW COL
merge ROW COL using "tmp2_1.dta"
display "  "
display "and check this"
tabulate _merge
drop _merge
* Use the swapped-orientation value wherever the first attempt found nothing
replace lDY = lDY_temp if lDY == .
* Post-1999 correlation minus pre-1999 correlation; pairs without both are dropped
generate diffDY = DY_correlation-lDY
drop if diffDY == .
label variable diffDY "Change in pairwise GDP correlation" 
****************************************************************************
*Averages by country pair
generate country1 = substr(ROW, 1, 2)
codebook country1 
generate country2 = substr(COL, 1, 2)
codebook country2
* cc identifies each (country1, country2) combination
egen cc = group(country1 country2)
* Mean change per country pair, kept on one record of each group only
bysort cc: egen mean_diffDY = mean(diffDY)
bysort cc: replace mean_diffDY = . if _n > 1
save "tmp3.dta", replace
****************************************************************************

* Figure 3
* Density of the change in pairwise correlation: same-country pairs (dashed line)
* against pairs from different countries (solid line).
use "tmp3.dta", clear
kdensity diffDY if same==1, lwidth(medthick) lpattern(dash) ///
	addplot(kdensity diffDY if same==0, lwidth(medthick) lpattern(solid)) ///
	legend(label(1 "Same country") label(2 "Different countries")) ///
	title("GDP growth correlations:" "Within NUTS3 pair change between 1980-98 & 99-2009")
graph export "section1/figure_3.png", replace
***

* Figure 4
* Density of the change in pairwise correlation for cross-country pairs in sample3,
* split into Not EMU-Not EMU (dash), EMU-Not EMU (longdash) and EMU-EMU (solid).
use "tmp3.dta", clear
kdensity diffDY if sample3==1&nemunemu==1&same==0, lwidth(medthick) lpattern(dash) ///
	addplot(kdensity diffDY if sample3==1&nemuemu==1&same==0, lwidth(medthick) lpattern(longdash) xlab(-2(1)2) ylab(0(0.5)1) ///
		|| kdensity diffDY if sample3==1&emuemu==1&same==0, lwidth(medthick) lpattern(solid) ) ///
	legend(label(1 "Not EMU-Not EMU") label(2 "EMU-Not EMU") label(3 "EMU-EMU")) ///
	title("GDP growth correlations:" "Within NUTS3 pairs change between 1980-98 & 99-2009") ///
	note("Only NUTS3 pairs belonging to different countries.")
graph export "section1/figure_4.png", replace
***

* Figure 5
* Step 1: reopen each saved eigenvalue graph, load the data behind it (its serset) and
* save Eigenvalue under a graph-specific name in t1.dta to t4.dta, sorted by Number.
local k = 0
foreach tag in sc_pre sc_post pca_pre pca_post {
	local ++k
	clear
	graph use "section1/fig3_`tag'_GDP" 
	serset dir
	serset use
	rename Eigenvalue Eigenvalue_`tag'
	sort Number
	save "t`k'.dta",replace
}
* Step 2: put the four series side by side, matched on Number.
use "t1.dta", clear
forvalues k = 2/4 {
	sort Number
	merge Number using "t`k'.dta"
	drop _merge
}
* Only the first six ranks are plotted
keep if Number < 7
label variable Number "Eigenvalue Rank"
* Step 3: one panel per approach, pre-1999 solid and post-1999 dashed, then combine.
twoway (line Eigenvalue_sc_pre Eigenvalue_sc_post Number, lwidth(thick thick) lpattern(solid dash)), ///
	title("Eigenvalue Scree Plot" "Spectral Clustering approach") legend(off)
graph save "section1/fig3_sc_GDP", replace
twoway (line Eigenvalue_pca_pre Eigenvalue_pca_post Number, lwidth(thick thick) lpattern(solid dash)), ///
	title("Eigenvalue Scree Plot" "Principal Component approach") legend(off)
graph save "section1/fig3_pca_GDP", replace
graph combine "section1/fig3_pca_GDP" "section1/fig3_sc_GDP", r(1) c(2) ///
	title("Estimated Clusters" "EU 1980-98 (Solid) vs. EU 99-2009 (Dash)")
graph export "section1/figure_5.png", replace as(png)
***

* Figure A.2
* Density of the change in pairwise correlation for cross-country pairs in sample2,
* split into Periphery-Periphery (dash), Core-Periphery (longdash) and Core-Core (solid).
use "tmp3.dta", clear
kdensity diffDY if sample2==1&perper==1&same==0, lwidth(medthick) lpattern(dash) ///
	addplot(kdensity diffDY if sample2==1&coreper==1&same==0, lwidth(medthick) lpattern(longdash) ///
		|| kdensity diffDY if sample2==1&corecore==1&same==0, lwidth(medthick) lpattern(solid) ) ///
	legend(label(1 "Periphery-Periphery") label(2 "Core-Periphery") label(3 "Core-Core")) ///
	title("GDP growth correlations:" "Within NUTS3 pairs change between 1980-98 & 99-2009") ///
	note("Only NUTS3 pairs belonging to different countries.")
graph export "appendix/figure_A2.png", replace as(png)
***

****************************************************************************
*Housekeeping
* Scratch datasets written to the working directory during the run
local scratch tmp0 tmp1 tmp1_1 tmp2 tmp2_1 tmp3 t1 t2 t3 t4
foreach stub of local scratch {
	erase "`stub'.dta"
}
* Intermediate graph files behind Figure 5
local graphs fig3_pca_pre_GDP fig3_sc_pre_GDP fig3_pca_post_GDP fig3_sc_post_GDP fig3_sc_GDP fig3_pca_GDP
foreach stub of local graphs {
	erase "section1/`stub'.gph"
}
* Derived datasets: correlation vectors, pairwise distances and population lookups
local derived GDP_post GDP_pre pairwise_NUTS_id_distance pop_col pop_row
foreach stub of local derived {
	erase "`stub'.dta"
}



